# --- Data import ----
# Read the cleaned social-media survey: 21 respondents, weekly hours spent
# on each app plus a self-reported mood score for the week.
library(readxl)
# NOTE(review): hard-coded absolute path — this breaks on any other machine;
# consider a path relative to the project root.
sm <- read_excel("/Users/user/Downloads/social_media_cleaned.xlsx")
str(sm)
## tibble [21 × 10] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
## $ Instagram : num [1:21] 3.5 7.73 3.77 5.38 12 ...
## $ LinkedIn : num [1:21] 4 5.2 7 5.317 0.583 ...
## $ SnapChat : num [1:21] 1 3.683 0.533 1.3 0 ...
## $ Twitter : num [1:21] 5 0 0 0 0.667 ...
## $ Whatsapp/Wechat : num [1:21] 1 4.18 9.83 5.3 3 ...
## $ youtube : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
## $ OTT : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
## $ Reddit : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
## $ How you felt the entire week?: num [1:21] 3 3 4 4 3 5 4 3 3 2 ...
#Dependent Variable: How you felt the entire week? #Independent Variables:Instagram,LinkedIn,SnapChat,Twitter,Whatsapp,youtube,OTT,Reddit
# Keep only the eight numeric usage columns (drops the character id column 1
# and the mood outcome column 10). Selection is positional, so it assumes the
# exact column order shown by str() above.
sm1 <- sm[, 2:9]
sm1
## # A tibble: 21 × 8
## Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube OTT Reddit
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.5 4 1 5 1 2.5 14.5 2.5
## 2 7.73 5.2 3.68 0 4.18 4.25 0 0
## 3 3.77 7 0.533 0 9.83 1.85 2 0
## 4 5.38 5.32 1.3 0 5.3 2 2 0
## 5 12 0.583 0 0.667 3 3.5 2 1
## 6 2.33 7 0.467 0 12 7 3 0
## 7 5.37 4 0 0 6 3 0 0
## 8 7 4 3 0 10 2 3 0
## 9 8.65 10 3.83 0 6.15 4 3 0
## 10 0.167 0 0 0 1 3 0 0
## # ℹ 11 more rows
#load necessary libraries
library(magrittr)
library(NbClust)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Hierarchical Clustering- Dendrogram
# Standardize each column (mean 0, sd 1) so apps with large hour counts do
# not dominate the Euclidean distances.
sm_scaled <- scale(sm1)
dist_matrix <- dist(sm_scaled)
#Clustering Single (single linkage: merge on nearest neighbours)
hc <- hclust(dist_matrix,method = "single")
fviz_dend(hc)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Default Clustering (hclust's default method is complete linkage)
hc <- hclust(dist_matrix)
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#Average Clustering
# NOTE: `hc` is overwritten at each step — everything below (cutree etc.)
# uses this average-linkage tree, not the single/complete ones.
hc <- hclust(dist_matrix,method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#By observing the above dendrogram’s k=2 clusters will be sufficient.This is confirmed further with D index graphical representation.
num_clusters <- 2
clusters <- cutree(hc, k = num_clusters)
# Membership for each cluster
table(clusters)
## clusters
## 1 2
## 1 20
# NOTE(review): the 1 / 20 split means "cluster 1" is a single outlying
# respondent rather than a balanced grouping.
# Principal Components
# PCA on the correlation scale (scale = TRUE standardizes each app's hours
# before the decomposition). The scores (pca_result$x) are used below to
# project the k-means clusters onto the first two components.
pca_result <- prcomp(sm1,scale=TRUE)
pca_result
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Instagram 0.43537868 -0.15356106 0.377944297 0.03258545 -0.36643468
## LinkedIn 0.35649468 -0.20991651 -0.329259391 -0.19907722 0.70255046
## SnapChat 0.15947567 0.03392372 0.717791182 0.10281712 0.51019404
## Twitter -0.39046510 -0.53815939 0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat 0.52694693 0.06079220 0.007167203 0.09132133 -0.28308505
## youtube 0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT 0.08162766 -0.68423619 0.216480569 -0.12793328 -0.07632718
## Reddit -0.12361633 -0.35601755 -0.139687974 0.90062217 0.08919336
## PC6 PC7 PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn 0.02630366 -0.42220705 -0.09096859
## SnapChat -0.04887482 0.39304838 -0.17449187
## Twitter -0.02585534 0.04614556 -0.65794190
## Whatsapp/Wechat 0.68926770 0.02263817 -0.39306576
## youtube -0.39774620 0.64519354 -0.03187491
## OTT 0.31710521 0.06567287 0.59264873
## Reddit -0.02048963 -0.07045776 -0.11831273
#Non-Hierarchical Clustering (k-means)
# Cluster the *scaled* data so every app contributes equally — and so the
# clustering lives in the same feature space as the silhouette and PCA plots
# below, which all use sm_scaled (the original clustered raw sm1 but plotted
# against sm_scaled, an inconsistent mix).
# set.seed() makes the random initialization reproducible; nstart = 25 runs
# multiple starts and keeps the best, guarding against a poor local optimum.
set.seed(123)
num_clusters <- 2
kmeans_model <- kmeans(sm_scaled, centers = num_clusters, nstart = 25)
# Membership for each cluster
table(kmeans_model$cluster)
# Visualize cluster and membership using first two Principal Components
fviz_cluster(list(data = pca_result$x[, 1:2], cluster = kmeans_model$cluster))
# Visualize cluster centers for k-means.
# ellipse.type replaces the deprecated frame/frame.type arguments (their use
# previously triggered deprecation warnings).
fviz_cluster(kmeans_model, data = sm_scaled, geom = "point",
             ellipse.type = "convex", pointsize = 2, fill = "white",
             main = "K-means Cluster Centers")
# Visualize cluster and membership using first two Principal Components for k-means.
# sm_scaled already has unit-variance columns, so a second scale = TRUE is
# unnecessary (it would rescale by sd = 1, a no-op).
pca_result <- prcomp(sm_scaled)
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point",
             pointsize = 2, fill = "white",
             main = "K-means Clustering Result (PCA)")
# Calculate silhouette information for k-means clustering
sil <- silhouette(kmeans_model$cluster, dist(sm_scaled))
# Visualize the silhouette plot for k-means clustering
# (low average widths here indicate weakly separated clusters).
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
#optimal cluster method/visualization
# NbClust evaluates a battery of cluster-validity indices on the scaled data
# for k = 2..10 under complete linkage and reports the majority vote.
res.nbclust <- sm1 %>% scale() %>% NbClust(distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete", index ="all")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 4 proposed 8 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# The majority vote (k = 2) agrees with the dendrogram reading above.
#From cluster analysis I am able to recognize users whose social media usage patterns are similar to mine.
#Get the correlations between the variables
# use = "complete.obs" drops any row containing NA before computing the
# Pearson correlations.
cor(sm1, use = "complete.obs")
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## Instagram 1.00000000 0.097056399 0.28968877 -0.1930565 0.3776962
## LinkedIn 0.09705640 1.000000000 0.02552545 -0.1300685 0.2288356
## SnapChat 0.28968877 0.025525452 1.00000000 -0.1799569 0.0809998
## Twitter -0.19305653 -0.130068464 -0.17995686 1.0000000 -0.4958329
## Whatsapp/Wechat 0.37769615 0.228835623 0.08099980 -0.4958329 1.0000000
## youtube 0.33000187 0.452197669 -0.16006877 -0.1881777 0.3716852
## OTT 0.26738122 0.185492527 0.13158590 0.5570740 0.1336204
## Reddit -0.07461553 -0.006992884 -0.08116237 0.1649030 -0.1344497
## youtube OTT Reddit
## Instagram 0.33000187 0.2673812 -0.074615529
## LinkedIn 0.45219767 0.1854925 -0.006992884
## SnapChat -0.16006877 0.1315859 -0.081162369
## Twitter -0.18817769 0.5570740 0.164902964
## Whatsapp/Wechat 0.37168516 0.1336204 -0.134449660
## youtube 1.00000000 0.1605652 0.026399913
## OTT 0.16056523 1.0000000 0.232791099
## Reddit 0.02639991 0.2327911 1.000000000
#Computing Principal Components
# prcomp with scale = TRUE performs PCA on the correlation matrix — the same
# decomposition as pca_result computed earlier, recomputed here under the
# name used in the remainder of the report.
social_pca <- prcomp(sm1,scale=TRUE)
social_pca
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Instagram 0.43537868 -0.15356106 0.377944297 0.03258545 -0.36643468
## LinkedIn 0.35649468 -0.20991651 -0.329259391 -0.19907722 0.70255046
## SnapChat 0.15947567 0.03392372 0.717791182 0.10281712 0.51019404
## Twitter -0.39046510 -0.53815939 0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat 0.52694693 0.06079220 0.007167203 0.09132133 -0.28308505
## youtube 0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT 0.08162766 -0.68423619 0.216480569 -0.12793328 -0.07632718
## Reddit -0.12361633 -0.35601755 -0.139687974 0.90062217 0.08919336
## PC6 PC7 PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn 0.02630366 -0.42220705 -0.09096859
## SnapChat -0.04887482 0.39304838 -0.17449187
## Twitter -0.02585534 0.04614556 -0.65794190
## Whatsapp/Wechat 0.68926770 0.02263817 -0.39306576
## youtube -0.39774620 0.64519354 -0.03187491
## OTT 0.31710521 0.06567287 0.59264873
## Reddit -0.02048963 -0.07045776 -0.11831273
summary(social_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
# Eigenvalues are the squared singular values (per-component variances).
eigen_social <- social_pca$sdev^2
eigen_social
## [1] 2.2310606 1.7064291 1.2776717 0.8806523 0.8113819 0.5921023 0.3695367
## [8] 0.1311653
#From the PCA variance decomposition, PC1 and PC2 together explain around
#49% of the total variance (cumulative proportion 0.4922 above).
#Screeplot
plot(eigen_social, xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")
plot(log(eigen_social), xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")
library(FactoMineR)
# FactoMineR::PCA standardizes internally (scale.unit = TRUE by default), so
# running it on sm1 is equivalent to running it on sm_scaled. Compute it once
# and reuse the result for both the scree plot and the variable map (the
# original recomputed it and re-loaded FactoMineR/factoextra a second time).
res.pca <- PCA(sm1, graph = FALSE)
fviz_eig(res.pca, addlabels = TRUE)
###From the screeplot elbow it is beneficial to retain PC1..PC6, which cover
###about 93% of the total variance.
#Visualization using PCs
fviz_pca_var(res.pca, col.var = "black")
#From the above I can tell that most of my classmates usage timings of the apps Instagram,Whatsapp/Wechat, LinkedIn, Youtube and snapchat are similar. Most probably, twitter and reddit are not used by me and my classmates.
# load library for factor analysis
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Decide how many Factors are ideal for your dataset?
# Parallel analysis compares the observed eigenvalues with those obtained
# from random data of the same dimensions.
fa.parallel(sm1)
## Parallel analysis suggests that the number of factors = 0 and the number of components = 0
#Parallel analysis suggests that the number of factors = 0 and the number of components = 0
# NOTE(review): a suggestion of 0 factors indicates the data may not support
# a factor model at all — the conclusion the report itself reaches later.
fit.pc <- principal(sm1, nfactors=2, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp/Wechat 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #h2 explains how much variance of the variables are explained by the factors. #u2 indicates the amount of variance not explained by the factors Principal Components Analysis Call: principal(r = sm1, nfactors = 2, rotate = “varimax”) Standardized loadings (pattern matrix) based upon correlation matrix
# (Stale console output pasted from an earlier run, kept for reference —
# note the numbers differ slightly from the fit.pc printout above. These
# lines must be comments, otherwise sourcing the script fails.)
## RC1 RC2
## SS loadings 2.27 1.80 Proportion Var 0.25 0.20 Cumulative Var 0.25 0.45 Proportion Explained 0.56 0.44 Cumulative Proportion 0.56 1.00
## Mean item complexity = 1.3 Test of the hypothesis that 2 components are sufficient.
## The root mean square of the residuals (RMSR) is 0.14 with the empirical chi square 29.01 with prob < 0.066
# Component eigenvalues — identical to the PCA eigenvalues (eigen_social)
# computed earlier.
round(fit.pc$values, 3)
## [1] 2.231 1.706 1.278 0.881 0.811 0.592 0.370 0.131
# Loadings table; entries below 0.1 in absolute value are blanked in the
# printout (compare with the full table above).
fit.pc$loadings
##
## Loadings:
## RC1 RC2
## Instagram 0.681
## LinkedIn 0.589 0.111
## SnapChat 0.216 -0.110
## Twitter -0.359 0.840
## Whatsapp/Wechat 0.732 -0.300
## youtube 0.727
## OTT 0.371 0.822
## Reddit 0.498
##
## RC1 RC2
## SS loadings 2.189 1.749
## Proportion Var 0.274 0.219
## Cumulative Var 0.274 0.492
# Communalities (h2 column above): proportion of each variable's variance
# captured by the two components. SnapChat (0.059) is barely represented.
fit.pc$communality
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## 0.46314710 0.35873574 0.05870521 0.83436255 0.62581187
## youtube OTT Reddit
## 0.53256680 0.81378028 0.25038016
# Rotated factor scores, Notice the columns ordering: RC1, RC2
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp/Wechat 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
# Per-respondent component scores (standardized), one row per respondent.
fit.pc$scores
## RC1 RC2
## [1,] -0.49593354 3.81935345
## [2,] 0.37453109 -0.45810128
## [3,] 0.24035060 -0.39724117
## [4,] -0.11821501 -0.28545411
## [5,] -0.05535776 0.18730345
## [6,] 1.33982873 -0.15394397
## [7,] -0.16544326 -0.57370308
## [8,] 0.36239954 -0.37616101
## [9,] 1.27403174 0.04737682
## [10,] -1.47503111 -0.57670717
## [11,] -1.47090422 0.33142379
## [12,] -0.99564293 0.65382335
## [13,] -0.64587802 -0.84729946
## [14,] -0.54403365 -0.04349169
## [15,] -0.05025441 0.89199339
## [16,] 0.46518890 -0.19297427
## [17,] 0.63875357 -0.56313735
## [18,] -0.42630523 -0.44123138
## [19,] 0.50350291 -0.59787800
## [20,] 2.63240375 0.51258979
## [21,] -1.38799166 -0.93654010
# Visualize the two-component solution.
fa.plot(fit.pc) # See Correlations within Factors
# Show the columns that go into each factor
# (these two prose lines were previously un-commented, making the script
# fail to parse when sourced)
fa.diagram(fit.pc) # Visualize the relationship
# Perform some visualizations using the factors
#very simple structure visualization
vss(sm1)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## Call: vss(x = sm1)
## VSS complexity 1 achieves a maximimum of 0.65 with 5 factors
## VSS complexity 2 achieves a maximimum of 0.78 with 7 factors
##
## The Velicer MAP achieves a minimum of 0.07 with 1 factors
## BIC achieves a minimum of -35.54 with 1 factors
## Sample Size adjusted BIC achieves a minimum of 1.7 with 4 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.39 0.00 0.071 20 2.5e+01 0.19 7.0 0.39 0.1 -35.5 26.2 1.0
## 2 0.52 0.64 0.090 13 8.9e+00 0.78 4.2 0.64 0.0 -30.7 9.4 1.2
## 3 0.55 0.69 0.117 7 4.7e+00 0.70 2.9 0.75 0.0 -16.6 5.0 1.5
## 4 0.60 0.74 0.180 2 1.6e+00 0.45 2.1 0.81 0.0 -4.5 1.7 1.5
## 5 0.65 0.78 0.271 -2 6.1e-01 NA 1.4 0.87 NA NA NA 1.3
## 6 0.62 0.78 0.443 -5 3.9e-10 NA 1.2 0.90 NA NA NA 1.5
## 7 0.62 0.78 1.000 -7 0.0e+00 NA 1.2 0.90 NA NA NA 1.5
## 8 0.62 0.78 NA -8 0.0e+00 NA 1.2 0.90 NA NA NA 1.5
## eChisq SRMR eCRMS eBIC
## 1 3.2e+01 1.7e-01 0.196 -28.5
## 2 9.2e+00 8.8e-02 0.130 -30.4
## 3 2.1e+00 4.2e-02 0.084 -19.2
## 4 7.9e-01 2.6e-02 0.097 -5.3
## 5 2.2e-01 1.4e-02 NA NA
## 6 1.7e-10 3.8e-07 NA NA
## 7 2.2e-18 4.3e-11 NA NA
## 8 2.2e-18 4.3e-11 NA NA
# (Stale console output pasted from an earlier run, kept for reference —
# these lines must be comments, otherwise sourcing the script fails.)
## Very Simple Structure Call: vss(x = sm1) VSS complexity 1 achieves a maximimum of 0.61 with 6 factors VSS complexity 2 achieves a maximimum of 0.78 with 7 factors
## The Velicer MAP achieves a minimum of 0.06 with 1 factors BIC achieves a minimum of -53.17 with 1 factors Sample Size adjusted BIC achieves a minimum of 1.47 with 5 factors
## Statistics by number of factors
# Computing Correlation Matrix
corrm.social <- cor(sm1)
corrm.social
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## Instagram 1.00000000 0.097056399 0.28968877 -0.1930565 0.3776962
## LinkedIn 0.09705640 1.000000000 0.02552545 -0.1300685 0.2288356
## SnapChat 0.28968877 0.025525452 1.00000000 -0.1799569 0.0809998
## Twitter -0.19305653 -0.130068464 -0.17995686 1.0000000 -0.4958329
## Whatsapp/Wechat 0.37769615 0.228835623 0.08099980 -0.4958329 1.0000000
## youtube 0.33000187 0.452197669 -0.16006877 -0.1881777 0.3716852
## OTT 0.26738122 0.185492527 0.13158590 0.5570740 0.1336204
## Reddit -0.07461553 -0.006992884 -0.08116237 0.1649030 -0.1344497
## youtube OTT Reddit
## Instagram 0.33000187 0.2673812 -0.074615529
## LinkedIn 0.45219767 0.1854925 -0.006992884
## SnapChat -0.16006877 0.1315859 -0.081162369
## Twitter -0.18817769 0.5570740 0.164902964
## Whatsapp/Wechat 0.37168516 0.1336204 -0.134449660
## youtube 1.00000000 0.1605652 0.026399913
## OTT 0.16056523 1.0000000 0.232791099
## Reddit 0.02639991 0.2327911 1.000000000
# NOTE(review): plot() on a plain matrix scatter-plots its first two columns,
# not a correlation heatmap — probably not what was intended here; confirm,
# or use a dedicated correlation-plot function.
plot(corrm.social)
# Recomputes the same correlation-scale PCA as social_pca above (redundant
# but harmless).
social_pca <- prcomp(sm1, scale=TRUE)
summary(social_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
# plot() on a prcomp object shows the per-component variances (scree barplot).
plot(social_pca)
#Biplot Visualization
# biplot() on a psych::principal object dispatches to psych's biplot method:
# component scores with variable loadings overlaid.
biplot(fit.pc)
#I feel factor analysis is not beneficial for the social media data because
#we lose most of the apps' uniqueness by collapsing them into factors, and
#the factors capture only a small portion of the variance.
#The parallel-analysis screeplot indicated that the ideal number of factors
#for the social media data is zero.
#From the component analysis we got results similar to PCA: usage of
#Instagram, Whatsapp/Wechat, LinkedIn, Youtube and Snapchat is somewhat
#similar and high compared to OTT, Twitter and Reddit.
# Build the modelling frame for regression: columns 2..10 = the eight usage
# predictors plus the outcome `How you felt the entire week?` (9th column of
# sm2, shown as the extra variable in the printout below).
sm2 <- sm[, 2:10]
sm2
## # A tibble: 21 × 9
## Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube OTT Reddit
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.5 4 1 5 1 2.5 14.5 2.5
## 2 7.73 5.2 3.68 0 4.18 4.25 0 0
## 3 3.77 7 0.533 0 9.83 1.85 2 0
## 4 5.38 5.32 1.3 0 5.3 2 2 0
## 5 12 0.583 0 0.667 3 3.5 2 1
## 6 2.33 7 0.467 0 12 7 3 0
## 7 5.37 4 0 0 6 3 0 0
## 8 7 4 3 0 10 2 3 0
## 9 8.65 10 3.83 0 6.15 4 3 0
## 10 0.167 0 0 0 1 3 0 0
## # ℹ 11 more rows
## # ℹ 1 more variable: `How you felt the entire week?` <dbl>
# Performing multiple regression on the dataset
# Regress weekly mood on the eight usage variables. Use bare column names
# with `data = sm2` rather than sm2$col terms inside the formula: the fitted
# coefficients are identical, but predict() can then resolve columns from
# `newdata`. With the sm2$ form, predict() warned "'newdata' had 1 row but
# variables found have 21 rows" and silently returned the 21 in-sample
# fitted values instead of a prediction.
fit <- lm(`How you felt the entire week?` ~ Instagram + LinkedIn + SnapChat +
            Twitter + `Whatsapp/Wechat` + youtube + OTT + Reddit,
          data = sm2)
#show the results
summary(fit)
# Key results (numerically unchanged from the sm2$-style fit; only the
# coefficient labels lose the "sm2$" prefix):
#   intercept 2.687; no individual predictor significant at the 5% level;
#   residual SE 0.8115 on 12 df; R^2 = 0.2784, adjusted R^2 = -0.2026;
#   overall F-test p-value = 0.7774 — taken together, the usage hours do not
#   significantly explain weekly mood.
coefficients(fit)
###Equation form: y = b0 + b1*x1 + ... + b8*x8 with intercept b0 = 2.687 and
###slopes as printed (e.g. b1(Instagram) = -0.0336).
fitted(fit)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatterplots/correlations across all variables.
ggpairs(data = sm2, title = "Social-Media")
plot(fit, which = 1) # Residuals vs Fitted
plot(fit, which = 2) # Normal Q-Q plot
# Named resid_vals rather than `residuals` to avoid masking stats::residuals.
resid_vals <- residuals(fit)
#Plot residuals against fitted values to check for homoscedasticity
plot_resid_fitted <- ggplot() +
  geom_point(aes(x = fitted(fit), y = resid_vals)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Fitted Values", y = "Residuals",
       title = "Residuals vs Fitted Values Plot") +
  theme_minimal()
print(plot_resid_fitted)
# The residuals-vs-fitted plot checks model adequacy: a systematic pattern
# around zero (rather than random scatter) suggests the linear model is not
# capturing the underlying relationship.
# Predict mood for one hypothetical usage profile. Column names must match
# the training data exactly — note the backticked `Whatsapp/Wechat` (the
# original passed `Whatsapp = 4`, which could never have matched) and
# check.names = FALSE to keep the "/" in the column name.
new_obs <- data.frame(
  Instagram = 8, LinkedIn = 5, SnapChat = 4, Twitter = 4,
  `Whatsapp/Wechat` = 4, youtube = 8, OTT = 3, Reddit = 4,
  check.names = FALSE
)
predict(fit, newdata = new_obs)
#Make predictions using the model (in-sample: newdata is the training frame)
predicted <- predict(fit, newdata = sm2)
#Calculating RMSE by taking the square root of the mean of the squared differences between the actual values and the predicted values (predicted)
rmse <- sqrt(mean((sm2$`How you felt the entire week?` - predicted)^2))
rmse
## [1] 0.6134638
# RMSE of ~0.61 on the 1-5 mood scale. Note this is training error, so it
# overstates out-of-sample accuracy (adjusted R^2 above is negative).
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
#Nonlinearity
# component + residual plot
# Partial-residual plots: check whether each predictor's relationship with
# the outcome is approximately linear after adjusting for the others.
crPlots(fit)
# plot studentized residuals vs. fitted values
# NOTE(review): car is already loaded above; this second library() call is
# redundant (harmless).
library(car)
# spreadLevelPlot checks for non-constant error variance and suggests a
# variance-stabilizing power transformation of the response.
spreadLevelPlot(fit)
##
## Suggested power transformation: -3.296001